Introduction

This R notebook is a rapid analysi of the cohort data produced as part of the data synthesis of citizen science projects

Data

Lets load in the data, it is currently seperate sheets in an excel file. Lets pull out the seperate sheets and assign to data.frames

summary <- read_excel(path = 'data/BES citsci data summary v3.xlsx', sheet = 1)
metadata <- read_excel(path = 'data/BES citsci data summary v3.xlsx', sheet = 2)
retention_tasks <- read_excel(path = 'data/BES citsci data summary v3.xlsx', sheet = 3)
retention_time <- read_excel(path = 'data/BES citsci data summary v3.xlsx', sheet = 4)
ppt_inequality <- read_excel(path = 'data/BES citsci data summary v3.xlsx', sheet = 5)

I think I need to spread the cohort data so that time is represented in columns

ret <- pivot_wider(data = retention_time, names_from = 'Session', values_from = 'NumberOfPeople')
head(ret)
## # A tibble: 6 x 28
##   ProjectCode Cohort   `1`   `2`   `3`   `4`   `5`   `6`   `7`   `8`   `9`  `10`
##   <chr>        <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 CNC_SF           1   779   211   182   153    NA    NA    NA    NA    NA    NA
## 2 CNC_SF           2    NA  1536   385   299    NA    NA    NA    NA    NA    NA
## 3 CNC_SF           3    NA    NA  1697   450    NA    NA    NA    NA    NA    NA
## 4 CNC_SF           4    NA    NA    NA  2076    NA    NA    NA    NA    NA    NA
## 5 CNC_LA           1   891   156   158   126    NA    NA    NA    NA    NA    NA
## 6 CNC_LA           2    NA   732   215   161    NA    NA    NA    NA    NA    NA
## # ... with 16 more variables: `11` <dbl>, `12` <dbl>, `13` <dbl>, `14` <dbl>,
## #   `15` <dbl>, `16` <dbl>, `17` <dbl>, `18` <dbl>, `19` <dbl>, `20` <dbl>,
## #   `21` <dbl>, `22` <dbl>, `23` <dbl>, `24` <dbl>, `25` <dbl>, `26` <dbl>
# easy to view but not he format for ggplot

retention_time$CohortSession <- (retention_time$Session + 1) - retention_time$Cohort 
retention_time$ProjectCohort <- paste(retention_time$ProjectCode, 
                                      retention_time$Cohort,
                                      sep = '_')
retention_time$ProjectName <- NA
retention_time$ProjectName <- summary$ProjectName[match(retention_time$ProjectCode, summary$ProjectCode)]
retention_time$ProjectName[is.na(retention_time$ProjectName)] <-
  as.character(retention_time$ProjectCode[is.na(retention_time$ProjectName)])

That works. So now can we do an initial visualisation of the data

p <- ggplot(retention_time, aes(x = CohortSession, y = NumberOfPeople, group = ProjectCohort)) +
  geom_line(aes(colour = ProjectName)) +
  theme(legend.position = "none") 
ggplotly(p, tooltip = c('group'))
## Warning: `group_by_()` is deprecated as of dplyr 0.7.0.
## Please use `group_by()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.

Okay now we need to average across cohorts in the same project and rescale so they all start with the same value

# Rescale
rescale <- function(x){
  
  (x/max(x)) * 100
  
}

retention_time$NumberOfPeopleRescaled <- NA

for(i in unique(retention_time$ProjectCohort)){
  
  retention_time$NumberOfPeopleRescaled[retention_time$ProjectCohort == i] <-
    rescale(retention_time$NumberOfPeople[retention_time$ProjectCohort == i])
  
}


p <- ggplot(retention_time, aes(x = CohortSession, y = NumberOfPeopleRescaled, group = ProjectCohort)) +
  geom_line(aes(colour = ProjectName)) +
  theme(legend.position = "none") 
ggplotly(p, tooltip = c('group'))

And now average across projects

# group and average
av_ret <- tapply(retention_time$NumberOfPeopleRescaled,
                 INDEX = list(retention_time$ProjectCode,
                              retention_time$CohortSession),
                 FUN = mean)

# Put into long format
av_ret <- melt(av_ret, varnames = c('ProjectCode', 'CohortSession'), value.name = 'NumberOfPeopleRescaled')

av_ret$ProjectName <- summary$ProjectName[match(av_ret$ProjectCode, summary$ProjectCode)]
av_ret$ProjectName[is.na(av_ret$ProjectName)] <-
  as.character(av_ret$ProjectCode[is.na(av_ret$ProjectName)])

p <- ggplot(av_ret, aes(x = CohortSession, y = NumberOfPeopleRescaled,
                        group = ProjectName)) +
  geom_line(aes(colour = ProjectName)) +
  geom_point(aes(colour = ProjectName)) +
  scale_y_continuous(trans='log10') +
  theme(legend.position = "none") 
ggplotly(p, tooltip = c('group'))
## Warning: Transformation introduced infinite values in continuous y-axis

## Warning: Transformation introduced infinite values in continuous y-axis

After the first year these actually look very similar

retention_time$NumberOfPeopleRescaled <- NA

retention_time <- retention_time[retention_time$CohortSession != 1,]

for(i in unique(retention_time$ProjectCohort)){
  
  retention_time$NumberOfPeopleRescaled[retention_time$ProjectCohort == i] <-
    rescale(retention_time$NumberOfPeople[retention_time$ProjectCohort == i])
  
}

# group and average
av_ret2 <- tapply(retention_time$NumberOfPeopleRescaled,
                 INDEX = list(retention_time$ProjectCode,
                              retention_time$CohortSession),
                 FUN = mean)

# Put into long format
av_ret2 <- melt(av_ret2, varnames = c('ProjectCode', 'CohortSession'), value.name = 'NumberOfPeopleRescaled')

av_ret2$ProjectName <- 
  summary$ProjectName[match(av_ret2$ProjectCode,
                            summary$ProjectCode)]
av_ret2$ProjectName[is.na(av_ret2$ProjectName)] <-
  as.character(av_ret2$ProjectCode[is.na(av_ret2$ProjectName)])

p <- ggplot(av_ret2, aes(x = CohortSession, y = NumberOfPeopleRescaled,
                        group = ProjectName)) +
  geom_line(aes(colour = ProjectName)) +
  geom_point(aes(colour = ProjectName)) +
  scale_y_continuous(trans = 'log10') +
  theme(legend.position = "none") 
ggplotly(p, tooltip = c('group'))
## Warning: Transformation introduced infinite values in continuous y-axis

## Warning: Transformation introduced infinite values in continuous y-axis